library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)
options(
tigris_class = "sf",
tigris_use_cache = TRUE
)
Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")
Here I obtain various demographic data, including income (percent below 50% and 80% of area median income), vehicle ownership, age, English language ability, and occupants per room.
# obtain the saved census data
setwd("~/Documents/2020 Spring Quarter/CEE 218Z")
acs_vars = readRDS("censusData2018_acs_acs5.rds")
setwd("~/Documents/2020 Spring Quarter/CEE 218Z/covid19")
# load in income data - code adapted from other students
sj_median_income_by_block <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "B19013_001E"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
rename(
Median_Income = B19013_001E
) %>%
filter(!is.na(Median_Income)) %>%
left_join(sj_blockgroups, by = c("blockgroup" = "GEOID")) %>% #this code gives each blockgroup a district designation
filter(
!is.na(DISTRICTS)
) %>%
# this code joins our census data with the social distancing data, processed as shown below
left_join(sj_socialdistancing %>%
filter(weekend == F) %>%
filter(date > shelter_start) %>%
group_by(origin_census_block_group) %>%
summarize(
completely_home_device_count = sum(completely_home_device_count),
device_count = sum(device_count)) %>%
mutate(`% Completely at Home` = (completely_home_device_count/device_count*100) %>% round(1)),
by = c("blockgroup" = "origin_census_block_group")
) %>%
filter(
!is.na(device_count)
) %>%
left_join(sj_pre_sd_at_home_average %>% dplyr::select(origin_census_block_group, `% Completely at Home Pre Shelter`), by = c("blockgroup" = "origin_census_block_group"))
sj_ami_by_block <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B19001)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
group_by(blockgroup) %>%
summarize(
Total = B19001_001E,
`Under 75,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E),
#sum(lapply(2:12, function(x) as.name(paste0("B19001_00",x,"E"))))
`Under 100,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E, B19001_013E)
) %>%
mutate(
`% under 75,000` = `Under 75,000` / Total * 100,
`% under 100,000` = `Under 100,000` / Total * 100
) %>%
left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)
) %>%
filter(!is.na(device_count))
# loading in language data - code adapted from other students
sj_lang_by_block <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B16004)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(
key = "variable",
value = "estimate",
- blockgroup
) %>%
left_join(acs_vars, by = c("variable" = "name")) %>%
mutate(
tier = substr(label,lapply(label, function(x) max(unlist(gregexpr('!!',x)))+2),nchar(label))
) %>%
filter(tier %in% c('Speak English "not well"',
'Speak English "not at all"',
'Total', 'Speak Spanish',
'Speak Asian and Pacific Island languages')) %>%
group_by(blockgroup, tier) %>%
summarise(
estimate1 = sum(estimate)
) %>%
spread(
key = "tier",
value = "estimate1"
) %>%
mutate(
`% speaking english < well` = (`Speak English "not well"` + `Speak English "not at all"`) / Total * 100,
`% speaking spanish` = (`Speak Spanish`/ Total) * 100,
`% speaking api` = (`Speak Asian and Pacific Island languages` / Total) * 100
) %>%
left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)) %>%
filter(!is.na(device_count)) %>%
mutate(log_perc = log(`% speaking english < well`))
# loading in age data - specifically looking at percentage 65+ and percentage <30
sj_age_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B01001)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(
key = "variable",
value = "estimate",
- blockgroup
) %>%
mutate(
label = acs_vars$label[match(variable,acs_vars$name)]
) %>%
select(-variable) %>%
separate(
label,
into = c(NA,NA,"sex","age"),
sep = "!!"
) %>% filter(!is.na(age)) %>%
mutate(elderly = ifelse(age %in% c("65 and 66 years", "67 to 69 years", "70 to 74 years", "75 to 79 years", "80 to 84 years", "85 years and over"), estimate, NA), `less than 30` = ifelse(age %in% c("Under 5 years", "5 to 9 years", "10 to 14 years", "15 to 17 years", "18 and 19 years", "20 years", "21 years", "22 to 24 years", "25 to 29 years"), estimate, NA)) %>%
group_by(blockgroup) %>%
summarize(elderly = sum(elderly, na.rm = T), `less than 30` = sum(`less than 30`, na.rm = T), total = sum(estimate, na.rm = T)) %>%
mutate(`percent elderly` = elderly*100 / total, `percent less than 30` = `less than 30`*100 / total) %>%
left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)) %>%
filter(!is.na(device_count))
# get data on vehicles available as vehicles allocation
sj_vehicles_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B992512)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
dplyr::select(B992512_001E, blockgroup) %>%
rename(total_vehicles = B992512_001E, blockgroup = blockgroup) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
mutate(`vehicles per capita` = total_vehicles / total) %>%
filter(!is.na(device_count))
# also get data on vehicles available as households without a vehicle
sj_no_vehicles_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B25044)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, NA,"vehicles"), sep = "!!") %>%
filter(!is.na(vehicles)) %>%
group_by(blockgroup, vehicles) %>%
summarize(grouped_vehicles = sum(estimate)) %>%
spread(key = vehicles, value = grouped_vehicles) %>%
mutate(total_nums = `1 vehicle available` + `2 vehicles available` + `3 vehicles available` + `4 vehicles available` + `5 or more vehicles available` + `No vehicle available`, `percent no vehicles` = `No vehicle available`*100 / total_nums, `log percent` = log(`percent no vehicles`)) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
# get data on occupants per room
sj_occupants_per_room_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B25014)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, NA,"occupants per room"), sep = "!!") %>%
filter(!is.na(`occupants per room`)) %>%
group_by(blockgroup, `occupants per room`) %>%
summarize(estimate_tot = sum(estimate)) %>%
spread(key = `occupants per room`, value = estimate_tot) %>%
mutate(total_nums = `0.50 or less occupants per room` + `0.51 to 1.00 occupants per room` + `1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`, `percent 1 or more` = (`1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`) * 100/ total_nums) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
In the plots below, I show the selected variables against percent of devices completely at home since the shelter-in-place order started, as well as against percent of devices pre-shelter-in-place for comparison.
Age:
# age
sj_age_by_block %>%
ggplot(aes(
x = `percent less than 30`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of residents younger than 30",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Young Age Groups"
)
young_model <- lm(sj_age_by_block$`% Completely at Home` ~ sj_age_by_block$`percent less than 30`)
summary(young_model)
##
## Call:
## lm(formula = sj_age_by_block$`% Completely at Home` ~ sj_age_by_block$`percent less than 30`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.272 -4.473 0.233 4.833 28.159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.47212 1.52328 37.729 < 2e-16 ***
## sj_age_by_block$`percent less than 30` -0.20725 0.03861 -5.367 1.17e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.352 on 567 degrees of freedom
## Multiple R-squared: 0.04835, Adjusted R-squared: 0.04667
## F-statistic: 28.81 on 1 and 567 DF, p-value: 1.167e-07
sj_age_by_block %>% filter(`percent elderly` < 50) %>% # get rid of extreme outliers
ggplot(aes(
x = `percent elderly`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of residents ages 65 and older",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Elderly Population"
)
elderly_model <- lm(`% Completely at Home` ~ `percent elderly`, sj_age_by_block %>% filter(`percent elderly` < 50))
summary(elderly_model)
##
## Call:
## lm(formula = `% Completely at Home` ~ `percent elderly`, data = sj_age_by_block %>%
## filter(`percent elderly` < 50))
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.632 -4.680 0.259 5.108 28.119
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.02191 0.79188 59.38 < 2e-16 ***
## `percent elderly` 0.19522 0.05468 3.57 0.000387 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.456 on 564 degrees of freedom
## Multiple R-squared: 0.0221, Adjusted R-squared: 0.02037
## F-statistic: 12.75 on 1 and 564 DF, p-value: 0.0003869
# compare this to pre-shelter-in-place behavior
sj_age_by_block %>%
ggplot(aes(
x = `percent less than 30`,
y = `% Completely at Home Pre Shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of residents younger than 30",
y = "Percent devices completely at home pre-shelter-in-place",
title = "San Jose: Staying at Home and Young Age Groups Pre Shelter-in-Place"
)
young_model2 <- lm(sj_age_by_block$`% Completely at Home Pre Shelter` ~ sj_age_by_block$`percent less than 30`)
summary(young_model2)
##
## Call:
## lm(formula = sj_age_by_block$`% Completely at Home Pre Shelter` ~
## sj_age_by_block$`percent less than 30`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.9202 -2.9456 0.1802 2.4524 27.0472
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.49842 0.77360 22.620 < 2e-16 ***
## sj_age_by_block$`percent less than 30` 0.09199 0.01961 4.691 3.41e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.242 on 567 degrees of freedom
## Multiple R-squared: 0.03736, Adjusted R-squared: 0.03566
## F-statistic: 22.01 on 1 and 567 DF, p-value: 3.41e-06
sj_age_by_block %>% filter(`percent elderly` < 50) %>% # get rid of extreme outliers
ggplot(aes(
x = `percent elderly`,
y = `% Completely at Home Pre Shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of residents ages 65 and older",
y = "Percent devices completely at home on weekdays pre-shelter-in-place",
title = "San Jose: Staying at Home and Elderly Population Pre Shelter-in-Place"
)
elderly_model2 <- lm(`% Completely at Home Pre Shelter` ~ `percent elderly`, sj_age_by_block %>% filter(`percent elderly` < 50))
summary(elderly_model2)
##
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `percent elderly`,
## data = sj_age_by_block %>% filter(`percent elderly` < 50))
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.2215 -3.0147 0.1708 2.5173 26.2336
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.55815 0.39776 56.713 < 2e-16 ***
## `percent elderly` -0.11900 0.02746 -4.333 1.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.247 on 564 degrees of freedom
## Multiple R-squared: 0.03222, Adjusted R-squared: 0.0305
## F-statistic: 18.77 on 1 and 564 DF, p-value: 1.742e-05
Income:
# income - less than $75000
sj_ami_by_block %>%
ggplot(aes(
x = `% under 75,000`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Households Below 50% AMI"
)
income_75_model <- lm(`% Completely at Home` ~ `% under 75,000`, sj_ami_by_block)
summary(income_75_model)
##
## Call:
## lm(formula = `% Completely at Home` ~ `% under 75,000`, data = sj_ami_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.789 -4.168 0.541 4.649 20.690
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.9890 0.7199 80.55 <2e-16 ***
## `% under 75,000` -0.2233 0.0172 -12.99 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.447 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2296, Adjusted R-squared: 0.2282
## F-statistic: 168.7 on 1 and 566 DF, p-value: < 2.2e-16
# income - less than $100000
sj_ami_by_block %>%
ggplot(aes(
x = `% under 100,000`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes under $100,000 (80% AMI) annually",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Households Below 80% AMI"
)
income_100_model <- lm(`% Completely at Home` ~ `% under 100,000`, sj_ami_by_block)
summary(income_100_model)
##
## Call:
## lm(formula = `% Completely at Home` ~ `% under 100,000`, data = sj_ami_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.026 -4.007 0.736 4.507 20.120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.37827 0.83815 72.04 <2e-16 ***
## `% under 100,000` -0.22077 0.01592 -13.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.33 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2523
## F-statistic: 192.3 on 1 and 566 DF, p-value: < 2.2e-16
# compare to pre shelter in place
sj_ami_by_block %>%
ggplot(aes(
x = `% under 75,000`,
y = `% Completely at Home Pre Shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
y = "Percent devices completely at home on weekdays pre-shelter-in-place",
title = "San Jose: Staying at Home and Households Below 50% AMI Pre Shelter-in-Place"
)
income_75_model2 <- lm(`% Completely at Home Pre Shelter` ~ `% under 75,000`, sj_ami_by_block)
summary(income_75_model2)
##
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `% under 75,000`,
## data = sj_ami_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.2150 -2.8199 -0.0752 2.4328 27.3258
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.230228 0.397232 45.893 < 2e-16 ***
## `% under 75,000` 0.074344 0.009487 7.836 2.32e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.109 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.09787, Adjusted R-squared: 0.09627
## F-statistic: 61.4 on 1 and 566 DF, p-value: 2.315e-14
# income - less than $100000
sj_ami_by_block %>%
ggplot(aes(
x = `% under 100,000`,
y = `% Completely at Home Pre Shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes under $100,000 (80% AMI) annually",
y = "Percent devices completely at home on weekdays pre-shelter-in-place",
title = "San Jose: Staying Home and Households Below 80% AMI Pre Shelter-in-Place"
)
income_100_model2 <- lm(`% Completely at Home Pre Shelter` ~ `% under 100,000`, sj_ami_by_block)
summary(income_100_model2)
##
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `% under 100,000`,
## data = sj_ami_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.3737 -2.7043 -0.0472 2.4317 27.3698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.253442 0.464232 37.166 <2e-16 ***
## `% under 100,000` 0.077203 0.008818 8.755 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.06 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1177
## F-statistic: 76.66 on 1 and 566 DF, p-value: < 2.2e-16
Language:
# language
sj_lang_by_block %>%
ggplot(aes(
x = `% speaking english < well`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of individuals speaking English less than well",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and English Language Ability"
)
english_ability_model <- lm(`% Completely at Home` ~ `% speaking english < well`, sj_lang_by_block)
summary(english_ability_model)
##
## Call:
## lm(formula = `% Completely at Home` ~ `% speaking english < well`,
## data = sj_lang_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.388 -3.914 0.397 4.999 24.526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.04583 0.55037 94.566 < 2e-16 ***
## `% speaking english < well` -0.22415 0.03775 -5.938 5.02e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.307 on 567 degrees of freedom
## Multiple R-squared: 0.05855, Adjusted R-squared: 0.05689
## F-statistic: 35.26 on 1 and 567 DF, p-value: 5.02e-09
sj_lang_by_block %>%
ggplot(aes(
x = `% speaking spanish`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of individuals speaking Spanish",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Spanish Language Ability"
)
spanish_speaking_model <- lm(`% Completely at Home` ~ `% speaking spanish`, sj_lang_by_block)
summary(spanish_speaking_model)
##
## Call:
## lm(formula = `% Completely at Home` ~ `% speaking spanish`, data = sj_lang_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.548 -4.051 0.741 4.658 24.705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.32684 0.49214 108.36 <2e-16 ***
## `% speaking spanish` -0.17128 0.01645 -10.41 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.845 on 567 degrees of freedom
## Multiple R-squared: 0.1605, Adjusted R-squared: 0.159
## F-statistic: 108.4 on 1 and 567 DF, p-value: < 2.2e-16
# compare to pre shelter in place
sj_lang_by_block %>%
ggplot(aes(
x = `% speaking english < well`,
y = `% Completely at Home Pre Shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of individuals speaking English less than well",
y = "Percent devices completely at home on weekdays pre-shelter-in-place",
title = "San Jose: Staying at Home and English Language Ability Pre Shelter-in-Place"
)
english_ability_model2 <- lm(`% Completely at Home Pre Shelter` ~ `% speaking english < well`, sj_lang_by_block)
summary(english_ability_model2)
##
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `% speaking english < well`,
## data = sj_lang_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.3112 -2.8560 -0.0598 2.2760 27.7048
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.18999 0.26846 71.481 <2e-16 ***
## `% speaking english < well` 0.16300 0.01841 8.853 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.052 on 567 degrees of freedom
## Multiple R-squared: 0.1214, Adjusted R-squared: 0.1199
## F-statistic: 78.37 on 1 and 567 DF, p-value: < 2.2e-16
sj_lang_by_block %>%
ggplot(aes(
x = `% speaking spanish`,
y = `% Completely at Home Pre Shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of individuals speaking Spanish",
y = "Percent devices completely at home on weekdays pre shelter-in-place",
title = "San Jose: Staying at Home and Spanish Language Ability Pre Shelter-in-Place"
)
spanish_speaking_model2 <- lm(`% Completely at Home Pre Shelter` ~ `% speaking spanish`, sj_lang_by_block)
summary(spanish_speaking_model2)
##
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `% speaking spanish`,
## data = sj_lang_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4706 -2.7537 -0.0822 2.4274 27.6257
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.384808 0.254815 76.07 <2e-16 ***
## `% speaking spanish` 0.073935 0.008518 8.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.062 on 567 degrees of freedom
## Multiple R-squared: 0.1173, Adjusted R-squared: 0.1157
## F-statistic: 75.34 on 1 and 567 DF, p-value: < 2.2e-16
Occupants per room:
# occupants per room
sj_occupants_per_room_by_block %>%
ggplot(aes(
x = `percent 1 or more`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with more than 1 occupant per room",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Room Occupancy"
)
occupants_model <- lm(`% Completely at Home` ~ `percent 1 or more`, sj_occupants_per_room_by_block)
summary(occupants_model)
##
## Call:
## lm(formula = `% Completely at Home` ~ `percent 1 or more`, data = sj_occupants_per_room_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.977 -4.137 0.286 4.849 23.842
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.85791 0.46833 110.73 < 2e-16 ***
## `percent 1 or more` -0.23399 0.03277 -7.14 2.89e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.126 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.08262, Adjusted R-squared: 0.081
## F-statistic: 50.97 on 1 and 566 DF, p-value: 2.885e-12
# compare to pre shelter in place
sj_occupants_per_room_by_block %>%
ggplot(aes(
x = `percent 1 or more`,
y = `% Completely at Home Pre Shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with more than 1 occupant per room",
y = "Percent devices completely at home on weekdays pre shelter-in-place",
title = "San Jose: Staying at Home and Room Occupancy Pre Shelter-in-Place"
)
occupants_model2 <- lm(`% Completely at Home Pre Shelter` ~ `percent 1 or more`, sj_occupants_per_room_by_block)
summary(occupants_model2)
##
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `percent 1 or more`,
## data = sj_occupants_per_room_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.3938 -2.8717 0.0566 2.5198 27.1950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.61616 0.23366 83.952 <2e-16 ***
## `percent 1 or more` 0.14478 0.01635 8.854 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.054 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1217, Adjusted R-squared: 0.1201
## F-statistic: 78.4 on 1 and 566 DF, p-value: < 2.2e-16
Vehicle ownership:
# vehicles
sj_vehicles_by_block %>%
ggplot(aes(
x = `vehicles per capita`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Number of vehicles per capita",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Vehicles Per Capita"
)
# vehicles - percent with no vehicles
sj_no_vehicles_by_block %>%
ggplot(aes(
x = `percent no vehicles`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with no vehicle available",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Vehicle Availability"
)
vehicles_model <- lm(`% Completely at Home` ~ `percent no vehicles`, sj_no_vehicles_by_block)
summary(vehicles_model)
##
## Call:
## lm(formula = `% Completely at Home` ~ `percent no vehicles`,
## data = sj_no_vehicles_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.404 -4.625 0.235 5.096 24.696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.00370 0.43520 117.196 < 2e-16 ***
## `percent no vehicles` -0.29763 0.05439 -5.473 6.67e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.268 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.05025, Adjusted R-squared: 0.04858
## F-statistic: 29.95 on 1 and 566 DF, p-value: 6.672e-08
# compare to pre shelter in place
sj_no_vehicles_by_block %>%
ggplot(aes(
x = `percent no vehicles`,
y = `% Completely at Home Pre Shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with no vehicle available",
y = "Percent devices completely at home on weekdays pre shelter-in-place",
title = "San Jose: Social Distancing and Vehicle Availability Pre Shelter-in-Place"
)
vehicles_model2 <- lm(`% Completely at Home Pre Shelter` ~ `percent no vehicles`, sj_no_vehicles_by_block)
summary(vehicles_model2)
##
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `percent no vehicles`,
## data = sj_no_vehicles_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8094 -2.9241 -0.0556 2.7766 24.6695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.36142 0.22283 91.378 < 2e-16 ***
## `percent no vehicles` 0.13931 0.02785 5.003 7.55e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.233 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.04235, Adjusted R-squared: 0.04066
## F-statistic: 25.03 on 1 and 566 DF, p-value: 7.55e-07
Multiple regression analysis with income, age, language, and occupants per room
# multiple regression
modeltest <- lm(sj_ami_by_block$`% Completely at Home` ~ sj_ami_by_block$`% under 100,000` + sj_age_by_block$`percent less than 30` + sj_lang_by_block$`% speaking english < well` + sj_occupants_per_room_by_block$`percent 1 or more`)
summary(modeltest)
##
## Call:
## lm(formula = sj_ami_by_block$`% Completely at Home` ~ sj_ami_by_block$`% under 100,000` +
## sj_age_by_block$`percent less than 30` + sj_lang_by_block$`% speaking english < well` +
## sj_occupants_per_room_by_block$`percent 1 or more`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.056 -4.073 0.593 4.495 18.560
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 62.088401 1.629593 38.101
## sj_ami_by_block$`% under 100,000` -0.233430 0.020969 -11.132
## sj_age_by_block$`percent less than 30` -0.050195 0.042165 -1.190
## sj_lang_by_block$`% speaking english < well` 0.068559 0.045802 1.497
## sj_occupants_per_room_by_block$`percent 1 or more` 0.006381 0.046449 0.137
## Pr(>|t|)
## (Intercept) <2e-16 ***
## sj_ami_by_block$`% under 100,000` <2e-16 ***
## sj_age_by_block$`percent less than 30` 0.234
## sj_lang_by_block$`% speaking english < well` 0.135
## sj_occupants_per_room_by_block$`percent 1 or more` 0.891
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.32 on 563 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2595, Adjusted R-squared: 0.2542
## F-statistic: 49.31 on 4 and 563 DF, p-value: < 2.2e-16
I also consider education and internet access, based on previous research. Education:
sj_education_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B15003)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, "education level"), sep = "!!") %>%
mutate(`education level` = replace_na(`education level`, "total_educ")) %>% # if the education level field is NA, this corresponded to the total number in that blockgroup
spread(key = `education level`, value = estimate) %>%
mutate(`percent associates or higher` = (`Associate's degree` + `Bachelor's degree` + `Doctorate degree` + `Master's degree`)*100/total_educ, `percent less than associates` = 100-`percent associates or higher`) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
# plotting
sj_education_by_block %>%
ggplot(aes(
x = `percent less than associates`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of people without an degree at Associate's level or higher",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Education"
)
educ_model <- lm(`% Completely at Home` ~ `percent less than associates`, sj_education_by_block)
summary(educ_model)
##
## Call:
## lm(formula = `% Completely at Home` ~ `percent less than associates`,
## data = sj_education_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.245 -3.826 0.937 4.612 22.749
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.44472 0.92366 65.44 <2e-16 ***
## `percent less than associates` -0.20682 0.01642 -12.60 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.568 on 567 degrees of freedom
## Multiple R-squared: 0.2187, Adjusted R-squared: 0.2174
## F-statistic: 158.7 on 1 and 567 DF, p-value: < 2.2e-16
# compare to pre shelter in place
sj_education_by_block %>%
ggplot(aes(
x = `percent less than associates`,
y = `% Completely at Home Pre Shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of people without an degree at Associate's level or higher",
y = "Percent devices completely at home on weekdays pre-shelter-in-place",
title = "San Jose: Social Distancing and Education Pre Shelter-in-Place"
)
educ_model2 <- lm(`% Completely at Home Pre Shelter` ~ `percent less than associates`, sj_education_by_block)
summary(educ_model2)
##
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `percent less than associates`,
## data = sj_education_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5796 -2.6958 -0.1566 2.2967 28.8071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.108538 0.497667 34.377 < 2e-16 ***
## `percent less than associates` 0.074211 0.008844 8.391 3.86e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.078 on 567 degrees of freedom
## Multiple R-squared: 0.1105, Adjusted R-squared: 0.1089
## F-statistic: 70.4 on 1 and 567 DF, p-value: 3.863e-16
Internet:
sj_internet_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B28002)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, "subscription", "type", "additional"), sep = "!!") %>%
filter(is.na(subscription) | (type == "Broadband such as cable, fiber optic or DSL") & is.na(additional)) %>%
mutate(type = replace_na(type, "total_num")) %>%
dplyr::select(blockgroup, type, estimate) %>%
spread(key = type, value = estimate) %>%
mutate(`percent high speed` = `Broadband such as cable, fiber optic or DSL`*100/total_num, `percent no high speed` = 100-`percent high speed`) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
# plotting
sj_internet_by_block %>%
ggplot(aes(
x = `percent no high speed`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households without broadband such as cable, fiber optic or DSL",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and High Speed Internet"
)
internet_model <- lm(`% Completely at Home` ~ `percent no high speed`, sj_internet_by_block)
summary(internet_model)
##
## Call:
## lm(formula = `% Completely at Home` ~ `percent no high speed`,
## data = sj_internet_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.924 -3.971 0.406 4.550 21.988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.98606 0.62413 88.1 <2e-16 ***
## `percent no high speed` -0.28052 0.02751 -10.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.798 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1552, Adjusted R-squared: 0.1537
## F-statistic: 104 on 1 and 566 DF, p-value: < 2.2e-16
# compare to pre-shelter-in-place behavior
sj_internet_by_block %>%
ggplot(aes(
x = `percent no high speed`,
y = `% Completely at Home Pre Shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households without broadband such as cable, fiber optic or DSL",
y = "Percent devices completely at home on weekdays pre-shelter-in-place",
title = "San Jose: Social Distancing and High Speed Internet Pre Shelter-in-Place"
)
internet_model2 <- lm(`% Completely at Home Pre Shelter` ~ `percent no high speed`, sj_internet_by_block)
summary(internet_model2)
##
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `percent no high speed`,
## data = sj_internet_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.1874 -2.7840 0.0996 2.6369 27.4202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.25181 0.33488 57.489 < 2e-16 ***
## `percent no high speed` 0.09226 0.01476 6.251 8.02e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.184 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.06458, Adjusted R-squared: 0.06293
## F-statistic: 39.08 on 1 and 566 DF, p-value: 8.023e-10
Do another multiple regression analysis, this time with education and income
educ_income_model <- lm(sj_ami_by_block$`% Completely at Home` ~ sj_ami_by_block$`% under 100,000` + sj_education_by_block$`percent less than associates`)
summary(educ_income_model)
##
## Call:
## lm(formula = sj_ami_by_block$`% Completely at Home` ~ sj_ami_by_block$`% under 100,000` +
## sj_education_by_block$`percent less than associates`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.409 -3.857 0.852 4.615 20.190
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 62.33345 0.92253
## sj_ami_by_block$`% under 100,000` -0.15299 0.02129
## sj_education_by_block$`percent less than associates` -0.09991 0.02130
## t value Pr(>|t|)
## (Intercept) 67.568 < 2e-16 ***
## sj_ami_by_block$`% under 100,000` -7.187 2.10e-12 ***
## sj_education_by_block$`percent less than associates` -4.691 3.41e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.197 on 565 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2816, Adjusted R-squared: 0.2791
## F-statistic: 110.7 on 2 and 565 DF, p-value: < 2.2e-16
Experimentation with other variables and other ways of analyzing the social distancing data. First I look at a few other possible variables. I start with units in the structure.
# try getting other variables
# get data on units in structure
sj_units_in_structure_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B25024)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, "units"), sep = "!!") %>%
filter(!is.na(units)) %>%
spread(key = units, value = estimate) %>%
mutate(total_nums = `1, attached` + `1, detached` + `10 to 19` + `2` + `20 to 49`+ `3 or 4` + `5 to 9`+ `50 or more`+ `Boat, RV, van, etc.`+ `Mobile home`, `percent 20 or more` = (`20 to 49`+`50 or more`)* 100/ total_nums, `percent 1 only` = (`1, attached` + `1, detached`)*100/total_nums, `percent > 1` = 100 - `percent 1 only`) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
# plot
sj_units_in_structure_by_block %>%
ggplot(aes(
x = `percent 20 or more`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of structures with more than 20 units",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and 20 or More Units Per Structure"
)
summary(lm(`% Completely at Home` ~ `percent 20 or more`, sj_units_in_structure_by_block))
##
## Call:
## lm(formula = `% Completely at Home` ~ `percent 20 or more`, data = sj_units_in_structure_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.335 -4.843 0.201 5.190 25.765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.93469 0.40940 121.971 <2e-16 ***
## `percent 20 or more` -0.03712 0.02052 -1.809 0.071 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.459 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.005748, Adjusted R-squared: 0.003992
## F-statistic: 3.272 on 1 and 566 DF, p-value: 0.07099
sj_units_in_structure_by_block %>%
ggplot(aes(
x = `percent 1 only`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of structures with only one unit",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Only 1 Unit Per Structure"
)
summary(lm(`% Completely at Home` ~ `percent 1 only`, sj_units_in_structure_by_block))
##
## Call:
## lm(formula = `% Completely at Home` ~ `percent 1 only`, data = sj_units_in_structure_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.261 -4.405 0.224 5.078 24.608
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.75451 0.88920 50.331 < 2e-16 ***
## `percent 1 only` 0.06648 0.01132 5.872 7.33e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.237 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.05743, Adjusted R-squared: 0.05576
## F-statistic: 34.48 on 1 and 566 DF, p-value: 7.328e-09
Household type and size:
# load data on household type and size
sj_house_size_type_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B11016)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, "type", "size"), sep = "!!") %>%
filter(!is.na(type))
# household type
sj_house_type_by_block <- sj_house_size_type_by_block %>%
filter(is.na(size)) %>%
dplyr::select(-size) %>%
spread(key = type, value = estimate) %>%
mutate(`total households` = `Family households` + `Nonfamily households`, `percent nonfamily` = `Nonfamily households` / `total households`) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
sj_house_type_by_block %>%
ggplot(aes(
x = `percent nonfamily`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent nonfamily households",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Household Type"
)
summary(lm(`% Completely at Home` ~ `percent nonfamily`, sj_house_type_by_block))
##
## Call:
## lm(formula = `% Completely at Home` ~ `percent nonfamily`, data = sj_house_type_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.208 -4.597 -0.052 5.089 23.828
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.2308 0.6403 81.578 < 2e-16 ***
## `percent nonfamily` -11.0270 2.2223 -4.962 9.24e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.305 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.04169, Adjusted R-squared: 0.03999
## F-statistic: 24.62 on 1 and 566 DF, p-value: 9.245e-07
# household size
sj_house_size_by_block <- sj_house_size_type_by_block %>%
filter(!is.na(size)) %>%
dplyr::select(-type) %>%
group_by(blockgroup, size) %>%
summarize(`total of this size` = sum(estimate)) %>%
spread(key = size, value = `total of this size`) %>%
mutate(total_nums = `1-person household` + `2-person household` + `3-person household` + `4-person household` + `5-person household`+ `6-person household` + `7-or-more person household`, `percent 5 or more` = (`5-person household`+`6-person household` + `7-or-more person household`)* 100/ total_nums, `percent 1 or 2 only` = (`1-person household` + `2-person household`)*100/total_nums) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
sj_house_size_by_block %>%
ggplot(aes(
x = `percent 5 or more`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with 5 or more people",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Households With 5 or More"
)
summary(lm(`% Completely at Home` ~ `percent 5 or more`, sj_house_size_by_block))
##
## Call:
## lm(formula = `% Completely at Home` ~ `percent 5 or more`, data = sj_house_size_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.588 -4.419 0.605 4.958 25.419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.32314 0.56617 90.650 < 2e-16 ***
## `percent 5 or more` -0.10054 0.02541 -3.957 8.55e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.369 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.02692, Adjusted R-squared: 0.0252
## F-statistic: 15.66 on 1 and 566 DF, p-value: 8.545e-05
sj_house_size_by_block %>%
ggplot(aes(
x = `percent 1 or 2 only`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with 1 or 2 people",
y = "Percent devices completely at home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Small Household Size"
)
summary(lm(`% Completely at Home` ~ `percent 1 or 2 only`, sj_house_size_by_block))
##
## Call:
## lm(formula = `% Completely at Home` ~ `percent 1 or 2 only`,
## data = sj_house_size_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.673 -4.811 0.223 5.383 25.408
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.54506 0.98251 51.445 <2e-16 ***
## `percent 1 or 2 only` -0.02185 0.02043 -1.069 0.285
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.475 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.002016, Adjusted R-squared: 0.0002529
## F-statistic: 1.143 on 1 and 566 DF, p-value: 0.2854
Next I consider different ways of looking at the social distancing data. First I try distance traveled.
# try other ways of looking at the social distancing data
# first look at total distance traveled
sj_sd_distance <- sj_socialdistancing %>%
filter(date > shelter_start) %>%
group_by(origin_census_block_group) %>%
summarize(total_dist_traveled = sum(distance_traveled_from_home), device_count = sum(device_count)) %>%
mutate(total_dist_per_device = total_dist_traveled / device_count)
sj_distance_testing <- left_join(sj_ami_by_block, sj_sd_distance, by = c("blockgroup" = "origin_census_block_group")) %>% left_join(sj_age_by_block %>% select(blockgroup, `percent less than 30`))
sj_distance_testing %>% filter(total_dist_per_device < 500) %>%
ggplot(aes(
x = `% under 75,000`,
y = total_dist_per_device
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
y = "Average distance traveled per device during weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Income, Distance Metric"
)
This is very skewed by outliers, and probably not a useful metric.
Now I consider including devices that traveled <1km as staying at (or near) home.
sj_sd_range <- sj_socialdistancing %>%
filter(weekend == F) %>%
filter(date > shelter_start) %>%
mutate(travel_buckets_split = lapply(bucketed_distance_traveled, function(x) strsplit(x, "<1000")[[1]][2]), less_than_1km = lapply(travel_buckets_split, function(x) strsplit(x, ":")[[1]][2]), less_than_1km = lapply(less_than_1km, function(x) strsplit(x, ",")[[1]][1])) %>%
mutate(less_than_1km = lapply(less_than_1km, function(x) str_remove(x, "[}]"))) %>% # clean a bit more
mutate(less_than_1km = as.numeric(less_than_1km), less_than_1km = replace_na(less_than_1km, 0)) %>%
mutate(home_or_1km = completely_home_device_count + less_than_1km) %>%
group_by(origin_census_block_group) %>%
summarize(home_or_1km = sum(home_or_1km), device_count = sum(device_count)) %>%
mutate(`% Within 1km of Home` = (home_or_1km/device_count*100) %>% round(1))
# join this with other data
sj_1km_testing <- left_join(sj_ami_by_block, sj_sd_range, by = c("blockgroup" = "origin_census_block_group")) %>%
left_join(sj_occupants_per_room_by_block %>% dplyr::select(`percent 1 or more`, blockgroup)) %>%
left_join(sj_age_by_block %>% dplyr::select(`percent less than 30`, blockgroup)) %>%
left_join(sj_lang_by_block %>% dplyr::select(`% speaking english < well`, blockgroup))
# plot with income
sj_1km_testing %>%
ggplot(aes(
x = `% under 75,000`,
y = `% Within 1km of Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
y = "% of devices within 1km of home, weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Income, 1km Range"
)
summary(lm(`% Within 1km of Home` ~ `% under 75,000`, sj_1km_testing))
##
## Call:
## lm(formula = `% Within 1km of Home` ~ `% under 75,000`, data = sj_1km_testing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.458 -4.266 0.401 4.993 24.159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67.1812 0.7578 88.65 <2e-16 ***
## `% under 75,000` -0.2343 0.0181 -12.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.839 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2285, Adjusted R-squared: 0.2271
## F-statistic: 167.6 on 1 and 566 DF, p-value: < 2.2e-16
# plot with age
sj_1km_testing %>%
ggplot(aes(
x = `percent less than 30`,
y = `% Within 1km of Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of people younger than 30",
y = "Percent of devices within 1km of home during weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Age, 1km Range"
)
summary(lm(`% Within 1km of Home` ~ `percent less than 30`, sj_1km_testing))
##
## Call:
## lm(formula = `% Within 1km of Home` ~ `percent less than 30`,
## data = sj_1km_testing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.271 -4.766 0.304 5.309 25.413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68.78200 1.57767 43.597 < 2e-16 ***
## `percent less than 30` -0.27326 0.03999 -6.833 2.15e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.651 on 567 degrees of freedom
## Multiple R-squared: 0.07608, Adjusted R-squared: 0.07445
## F-statistic: 46.69 on 1 and 567 DF, p-value: 2.154e-11
# run multiple regression model
modeltest2 <- lm(sj_1km_testing$`% Within 1km of Home` ~ sj_1km_testing$`% under 75,000` + sj_1km_testing$`percent less than 30` + sj_1km_testing$`% speaking english < well` + sj_1km_testing$`percent 1 or more`)
summary(modeltest2)
##
## Call:
## lm(formula = sj_1km_testing$`% Within 1km of Home` ~ sj_1km_testing$`% under 75,000` +
## sj_1km_testing$`percent less than 30` + sj_1km_testing$`% speaking english < well` +
## sj_1km_testing$`percent 1 or more`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.972 -4.599 0.793 4.765 23.400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.77102 1.67952 42.733 < 2e-16
## sj_1km_testing$`% under 75,000` -0.20967 0.02267 -9.249 < 2e-16
## sj_1km_testing$`percent less than 30` -0.14057 0.04466 -3.148 0.00173
## sj_1km_testing$`% speaking english < well` -0.02227 0.04822 -0.462 0.64432
## sj_1km_testing$`percent 1 or more` 0.01303 0.04897 0.266 0.79024
##
## (Intercept) ***
## sj_1km_testing$`% under 75,000` ***
## sj_1km_testing$`percent less than 30` **
## sj_1km_testing$`% speaking english < well`
## sj_1km_testing$`percent 1 or more`
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.77 on 563 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.246, Adjusted R-squared: 0.2406
## F-statistic: 45.92 on 4 and 563 DF, p-value: < 2.2e-16
It looks like the fit of these selected variables is slightly better for the social distancing data based on not traveling farther than 1km.
Now I also consider “non-work” behavior.
sj_nonworking_by_block <- sj_socialdistancing %>%
filter(weekend == F) %>%
filter(date > shelter_start) %>%
mutate(nonworking = device_count - completely_home_device_count - part_time_work_behavior_devices - full_time_work_behavior_devices) %>%
group_by(origin_census_block_group) %>%
summarize(nonworking_count = sum(nonworking), total_device = sum(device_count)) %>%
mutate(nonworking_percent = nonworking_count*100 / total_device, percent_only_work_home = 100-nonworking_percent) %>%
left_join(sj_1km_testing %>% dplyr::select(`% under 75,000`, `percent less than 30`, `% speaking english < well`, `percent 1 or more`, blockgroup), by = c("origin_census_block_group" = "blockgroup"))
# plot against age and income
sj_nonworking_by_block %>%
ggplot(aes(
x = `% under 75,000`,
y = percent_only_work_home
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
y = "Percent of devices completely at home or leaving home for non-work purposes during weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Income, Nonworking Behavior"
)
summary(lm(percent_only_work_home ~ `% under 75,000`, sj_nonworking_by_block))
##
## Call:
## lm(formula = percent_only_work_home ~ `% under 75,000`, data = sj_nonworking_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.2184 -3.1028 0.1581 3.1343 22.1765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.53383 0.49714 143.89 <2e-16 ***
## `% under 75,000` -0.12775 0.01187 -10.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.142 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1698, Adjusted R-squared: 0.1683
## F-statistic: 115.8 on 1 and 566 DF, p-value: < 2.2e-16
sj_nonworking_by_block %>%
ggplot(aes(
x = `percent less than 30`,
y = percent_only_work_home
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of people younger than 30",
y = "Percent of devices completely at home or leaving home for non-work purposes during weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Age, Nonworking Behavior"
)
summary(lm(percent_only_work_home ~ `percent less than 30`, sj_nonworking_by_block))
##
## Call:
## lm(formula = percent_only_work_home ~ `percent less than 30`,
## data = sj_nonworking_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.7223 -3.4844 0.2204 3.5775 22.6311
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68.83712 1.02739 67.002 <2e-16 ***
## `percent less than 30` -0.05479 0.02604 -2.104 0.0358 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.633 on 567 degrees of freedom
## Multiple R-squared: 0.007747, Adjusted R-squared: 0.005997
## F-statistic: 4.427 on 1 and 567 DF, p-value: 0.03582
# multiple regression model
modeltest3 <- lm(sj_nonworking_by_block$nonworking_percent ~ sj_nonworking_by_block$`% under 75,000` + sj_nonworking_by_block$`percent less than 30` + sj_nonworking_by_block$`% speaking english < well` + sj_nonworking_by_block$`percent 1 or more`)
summary(modeltest3)
##
## Call:
## lm(formula = sj_nonworking_by_block$nonworking_percent ~ sj_nonworking_by_block$`% under 75,000` +
## sj_nonworking_by_block$`percent less than 30` + sj_nonworking_by_block$`% speaking english < well` +
## sj_nonworking_by_block$`percent 1 or more`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.9975 -3.2297 0.0295 3.1285 16.9204
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 30.89719 1.10085 28.067
## sj_nonworking_by_block$`% under 75,000` 0.10453 0.01486 7.035
## sj_nonworking_by_block$`percent less than 30` -0.07166 0.02927 -2.448
## sj_nonworking_by_block$`% speaking english < well` 0.04189 0.03161 1.326
## sj_nonworking_by_block$`percent 1 or more` 0.07375 0.03210 2.298
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## sj_nonworking_by_block$`% under 75,000` 5.82e-12 ***
## sj_nonworking_by_block$`percent less than 30` 0.0147 *
## sj_nonworking_by_block$`% speaking english < well` 0.1855
## sj_nonworking_by_block$`percent 1 or more` 0.0219 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.093 on 563 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.19, Adjusted R-squared: 0.1842
## F-statistic: 33.01 on 4 and 563 DF, p-value: < 2.2e-16
These variables do worse for the percent nonworking metric, which makes sense.